{
 "metadata": {
  "name": "",
  "signature": "sha256:8085e97a331345d09d79abcf05d188bbd241f68cab4a5f4fa70e56f954131e92"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from pyfasta import Fasta\n",
      "from Bio import SeqIO\n",
      "from Bio.Seq import Seq\n",
      "from Bio.SeqRecord import SeqRecord"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Open the GRCh38 cDNA transcript FASTA with pyfasta.\n",
      "# The path is relative, so the file must be in the current working directory.\n",
      "# NOTE(review): pyfasta is expected to create its index/flat files next to the\n",
      "# FASTA on first use -- confirm that the directory is writable.\n",
      "all_trans = Fasta(\"Homo_sapiens.GRCh38.cdna.all.fa\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 36
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Get the first `max_trans` transcripts longer than `min_len` bp"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Selection parameters: keep at most `max_trans` transcripts, each with a\n",
      "# length strictly greater than `min_len`.\n",
      "min_len = 500\n",
      "max_trans = 10\n",
      "\n",
      "trans_to_keep = {}  # transcript name -> sequence as a plain string\n",
      "count = 0           # number of transcripts accepted so far\n",
      "for tname in all_trans.keys():\n",
      "    # Quota reached: stop scanning the remaining transcripts.\n",
      "    if count == max_trans:\n",
      "        break\n",
      "    # Skip anything that is not strictly longer than the threshold.\n",
      "    if len(all_trans[tname]) <= min_len:\n",
      "        continue\n",
      "    trans_to_keep[tname] = str(all_trans[tname])\n",
      "    count += 1"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Now write the selected transcripts to a FASTA file"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Wrap each kept sequence in a Biopython SeqRecord so it can be written out.\n",
      "trans_recs = []\n",
      "for tname, sequence in trans_to_keep.items():\n",
      "    # The record id is the first whitespace-delimited token of the transcript\n",
      "    # name; the description is left empty.\n",
      "    record = SeqRecord(seq=Seq(sequence), id=tname.split()[0], description=\"\")\n",
      "    trans_recs.append(record)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Encode the selection parameters in the output file name; with max_trans = 10\n",
      "# and min_len = 500 this gives `10_trans_gt_500_bp.fasta`.\n",
      "out_fname = \"{0}_trans_gt_{1}_bp.fasta\".format(max_trans, min_len)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 34
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write every record in `trans_recs` to `out_fname` in FASTA format.\n",
      "# Opening in write mode replaces any existing file with that name.\n",
      "with open(out_fname, \"w\") as out_fhandle:\n",
      "    SeqIO.write(trans_recs, out_fhandle, \"fasta\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 35
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}